Code
library(tidyverse)library(tidyverse)Read in the full project structure from the project file to map experiment names to numbers.
proj <- jsonlite::read_json("data/project_1136_structure.json")
exp_data <- purrr::map_df(proj, \(comp) {
if (comp$component_type == "exp") {
data.frame(
exp_id = comp$id,
name = comp$name,
res_name = comp$res_name,
instructions = comp$instructions,
question = comp$question,
exptype = comp$exptype,
trial_order = comp$trial_order,
total_stim = comp$total_stim,
random_stim = comp$random_stim,
trials = length(comp$trial),
stim = length(comp$stim)
)
} else {
NULL
}
}) |>
mutate(exp = sub("ManyFaces? Pilot Ratings: ", "", res_name) |> trimws())
trial_data <- purrr::map_df(proj, \(comp) {
if (comp$component_type == "exp") {
purrr::map_df(comp$trial, \(trial) {
data.frame(
exp_id = comp$id,
n = trial$trial_n,
name = trial$name,
img_id = trial$center_img,
img_path = comp$stimuli[[as.character(trial$center_img)]]
)
})
} else {
NULL
}
}) |>
mutate(name = sub("^(manyfaces|attention_checks)/", "", name))exp_data |>
select(exp_id, exp, question, trials) |>
arrange(exp)This workflow requires the data-raw directory, which is not shared on github.
This is the SQL for downloading the data from Experimentum. We need to download in chunks of 50000 rows to avoid file download limits on the site (not needed if downloading directly from SQL).
SELECT
session.id as session_id, project_id, exp.res_name as exp_name, exp_id,
session.user_id, user.sex as user_sex, user.status as user_status,
ROUND(DATEDIFF(ed.dt, REPLACE(birthday, "-00","-01"))/365.25, 1) AS user_age,
trial.name as trial_name,
trial_n,
`order`,
dv,
rt,
ed.side,
ed.dt
FROM session
LEFT JOIN user USING (user_id)
LEFT JOIN exp_data AS ed ON ed.session_id = session.id
LEFT JOIN exp ON exp.id = ed.exp_id
LEFT JOIN trial USING (exp_id, trial_n)
WHERE session.project_id = 1136
AND user.status IN ("guest", "registered")
AND exp_id IN (1384, 1400, 1399, 1398, 1401, 1402, 1403,
1404, 1405, 1397, 1390, 1389, 1388, 1387,
1386, 1385, 1382, 1381, 1380, 1379, 1377)
LIMIT 50000
OFFSET 0# combine multiple downloads into one file
exp_raw <- list.files("data-raw/exp", full.names = TRUE) |>
read_csv(show_col_types = FALSE) |>
unique() |>
filter(user_status %in% c("guest", "registered"))
write_csv(exp_raw, paste0("data-raw/ManyFaces-Pilot-Ratings-exps_", Sys.Date(), ".csv"))
# get most recent files
exp_file <- list.files("data-raw", "ManyFaces-Pilot-Ratings-exps",
full.names = TRUE) |>
sort(decreasing = TRUE) |>
pluck(1)
exp_raw <- read_csv(exp_file, show_col_types = FALSE) |>
filter(user_status %in% c("guest", "registered")) |>
unique()
# get most recent files
quest_file <- list.files("data-raw", "ManyFaces-Pilot-Ratings-quests",
full.names = TRUE) |>
sort(decreasing = TRUE) |>
pluck(1)
quest_raw <- read_csv(quest_file, show_col_types = FALSE) |>
filter(user_status %in% c("guest", "registered")) |>
unique()
# write to the data directory
write_csv(exp_raw, "data/manyfaces-pilot-exp.csv")
write_csv(quest_raw, "data/manyfaces-pilot-quest.csv")exp_raw <- read_csv("data/manyfaces-pilot-exp.csv", show_col_types = FALSE)
quest_raw <- read_csv("data/manyfaces-pilot-quest.csv", show_col_types = FALSE)ed <- exp_data |>
select(exp_id, exp, trials) |>
rename(trials_total = trials)
exp_long_raw <- exp_raw |>
select(session_id, exp_id, trial_name, dv, rt, dt) |>
unique() |>
mutate(trial_name = sub("^(manyfaces|attention_checks)/", "", trial_name)) |>
left_join(ed, by = "exp_id")Remove remove duplicate trials and incomplete data
We start with 2115 participants completing a total of 2158 experiments.
Some participants completed more than the maximum amount of trials in an experiment. Duplicate trials were removed by only retaining participants’ initial ratings of a particular stimulus.
# NOTE: this is not elegant as it takes QUITE long to run. I initially just did the slicing for the subsection of IDs that I knew contained duplicates, but I felt this made code unnecessarily hard to follow. Happy to change back if running time is a concern!
dupl_trials_rm <- exp_long_raw |>
group_by(session_id, exp_id, trial_name, trials_total) |>
slice_min(dt, n = 1) |>
ungroup()incomplete <- dupl_trials_rm |>
count(session_id, exp_id, trials_total) |>
filter(n < trials_total)
complete <- dupl_trials_rm |>
filter(!session_id %in% incomplete$session_id)190 participants did not complete all trials of an experiment and were excluded.
Some participants also completed more than one experiment. Data participants provided in a second rating study was removed.
dupl_exp_ids <- complete |>
# Mark participants that participated twice and establish which exp they did first
group_by(session_id, exp_id) |>
summarise(first_dt = min(dt), .groups = "drop") |>
group_by(session_id) |>
mutate(n_exp = n()) |>
mutate(first_exp = exp_id[which.min(first_dt)]) |>
ungroup() |>
filter(n_exp > 1 & exp_id != first_exp)exp_long <- complete |>
anti_join(dupl_exp_ids, by = c("session_id", "exp_id" = "first_exp"))Prior to pre-registered exclusions, preliminary sample thus consisted of 1936 participants.
Participants were excluded based on overly consistent responding, i.e. if they responded to at least 90% of trials identically.
overly_consistent <- exp_long |>
summarise(
same_pcnt = max(tabulate(match(dv, unique(dv)))) / n(),
.by = c(session_id, exp_id)) |>
dplyr::select(session_id, exp_id, same_pcnt) |>
filter(same_pcnt >= 0.90)Participants were excluded based on overly fast responding, i.e. if their median reaction time fell below the 1st percentile of the overall distribution of median reaction times.
med_rt <- exp_long |>
summarise(med_rt = median(rt),
.by = c(session_id, exp_id))
overly_fast <- med_rt |>
filter(med_rt < quantile(med_rt, probs = 0.01))med_rt |>
ggplot(aes(x = med_rt)) +
geom_histogram(bins = 100) +
geom_vline(xintercept = quantile(med_rt$med_rt, probs = 0.01),
color = "red", linetype = "dashed", linewidth = 1) +
scale_x_continuous(breaks = seq(0, 10000, 500)) +
labs(x = "Median Reaction Time (ms)") +
theme_bw()Participants were excluded based on self-reported honesty check, i.e. if they responded not taking the study seriously vs. taking it authentically
honesty_check_failed <- quest_raw |>
filter(q_name == "try") |>
select(session_id, honesty_check = dv) |>
unique() |>
filter(honesty_check != 2) |>
mutate(exp_id = NA) |>
relocate(exp_id, .after = session_id)Participants were excluded if they failed two or more attention checks
attn_checks_failed <- exp_long |>
select(session_id:dv) |>
filter(grepl("check", trial_name)) |>
mutate(check_type = sub("check_[a-z0-9-]+_", "", trial_name),
check_type = ifelse(exp_id == 1400, substr(check_type, 4, 6), check_type)) |>
summarise(attn_checks_passed = mean(check_type == dv),
.by = c("session_id", "exp_id")) |>
filter(attn_checks_passed < 5/7)exclusions <- overly_consistent |>
full_join(overly_fast, by = c("session_id", "exp_id")) |>
full_join(honesty_check_failed, by = c("session_id", "exp_id")) |>
full_join(attn_checks_failed, by = c("session_id", "exp_id"))exclusions |>
summarise(
`>90% same response` = sum(!is.na(same_pcnt)),
`RT below cutoff` = sum(!is.na(med_rt)),
`Failed honesty check` = sum(!is.na(honesty_check)),
`Failed 2 or more of 7 attention checks` = sum(!is.na(attn_checks_passed)),
"Total exclusions" = n()
) |>
pivot_longer(
cols = everything(),
names_to = "Reason for exclusion",
values_to = "Number of participants"
)Exclude 29 participants and remove attention checks from data.
exp <- exp_long |>
anti_join(exclusions, by = c("session_id", "exp_id")) |>
filter(!grepl("check_", trial_name))After exclusions, there were 1909 participants.
Number of participants per study:
exp |>
summarise(.by = c(exp, session_id)) |>
count(exp)endtimes <- quest_raw |>
summarise(end = max(endtime), .by = c(session_id))
times <- exp |>
summarise(start = min(dt), .by = c(session_id)) |>
left_join(endtimes, by = "session_id") |>
mutate(duration = interval(start, end) |> as.numeric("minutes"))quest <- quest_raw |>
anti_join(exclusions, by = c("session_id")) |>
select(session_id, q_name, dv, endtime) |>
unique() |>
pivot_wider(names_from = q_name, values_from = dv) |>
mutate(age = as.integer(age))ggplot(quest, aes(x = age, fill = gender)) +
geom_histogram(binwidth = 1) +
scale_fill_manual(values = c("hotpink", "lightblue", "orchid", "gray"))count(quest, residence, sort = TRUE)quest |>
mutate(ethnicity = tolower(ethnicity)) |>
count(ethnicity, sort = TRUE)count(quest, device, sort = TRUE)rainbow <- c("firebrick", "darkorange", "goldenrod", "darkgreen", "dodgerblue3", "darkorchid4")exp_levels <- c("attractive", "trustworthy", "dominant",
"memorable", "gender-typical")
exp |>
filter(exp_id %in% 1377:1382) |>
mutate(dv = as.integer(dv),
exp = factor(exp, exp_levels)) |>
ggplot(aes(x = dv, fill = exp)) +
geom_histogram(binwidth = 1, show.legend = FALSE, color = "black") +
facet_wrap(~exp, ncol = 3, axes = "all_x", drop = FALSE) +
labs(title = "Standardised Neutral Ratings",
x = "") +
scale_x_continuous(breaks = 1:7) +
scale_fill_manual(values = rainbow, drop = FALSE)# function to create heatmap visualisations
heatmap <- function(id, label) {
exp |>
filter(exp %in% id) |>
separate(trial_name, c("lab", "id"), extra = "drop") |>
count(lab, id, dv) |>
ggplot(aes(x = dv, y = id, fill = n)) +
geom_tile() +
facet_wrap(~lab) +
scale_fill_viridis_c() +
labs(x = label, y = NULL,
title = paste(label, "Ratings")) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90))
}heatmap("attractive", "Attractiveness")heatmap("trustworthy", "Trustworthiness")heatmap("dominant", "Dominance")heatmap("memorable", "Memorableness")heatmap("gender-typical", "Gender Typicality")exp_labels <- c("attractive", "trustworthy", "dominant")
exp_levels <- paste(exp_labels, "(unstd)")
exp |>
filter(exp_id %in% 1397:1399) |>
mutate(dv = as.integer(dv),
exp = factor(exp, exp_levels, exp_labels)) |>
ggplot(aes(x = dv, fill = exp)) +
geom_histogram(binwidth = 1, show.legend = FALSE, color = "black") +
facet_wrap(~exp, ncol = 5, drop = FALSE) +
labs(title = "Unstandardised Neutral Ratings",
x = "") +
scale_x_continuous(breaks = 1:7) +
scale_fill_manual(values = rainbow, drop = FALSE)heatmap("attractive (unstd)", "Attractiveness (Unstandardised)")heatmap("trustworthy (unstd)", "Trustworthiness (Unstandardised)")heatmap("dominant (unstd)", "Dominance (Unstandardised)")dv_levels <- c("anger", "disgust", "fear",
"happiness", "sadness", "surprise", "other")
emo_levels <- c("ang", "dis", "fea", "hap", "sad", "sur")
emo_labels <- paste(dv_levels[1:6], "faces")
exp |>
filter(exp_id %in% c(1384, 1401:1405)) |>
separate(trial_name, c("lab", "model", "type", "emo", "view")) |>
mutate(dv = factor(dv, dv_levels),
emo = factor(emo, emo_levels, emo_labels)) |>
ggplot(aes(x = dv, fill = dv)) +
geom_point(aes(x = x, colour = I(fill), fill = I(fill)),
data.frame(emo = factor(emo_levels, emo_levels, emo_labels),
x = 1:6,
fill = rainbow),
size = 6.5, y = -60, shape = 18, show.legend = FALSE) +
geom_bar(color = "transparent") +
facet_wrap(~emo, axes = "all_x", drop = FALSE) +
scale_x_discrete(labels = c("A", "D", "F", "H", "S", "U", "O")) +
scale_fill_manual(values = c(rainbow, "grey"), drop = FALSE) +
labs(title = "Emotion Ratings",
x = "",
fill = "Rated Emotion") +
coord_cartesian(clip="off") +
theme(axis.ticks.x = element_blank())exp_levels <- c("anger", "disgust", "fear",
"happiness", "sadness", "surprise")
exp |>
filter(exp_id %in% 1385:1390) |>
mutate(dv = as.integer(dv),
exp = factor(exp, exp_levels)) |>
ggplot(aes(x = dv, fill = exp)) +
geom_histogram(binwidth = 1, show.legend = FALSE, color = "black") +
facet_wrap(~exp, ncol = 3, axes = "all_x", drop = FALSE) +
labs(title = "Emotion Intensity Ratings",
x = "") +
scale_fill_manual(values = rainbow, drop = FALSE) +
scale_x_continuous(breaks = 1:7)dv_levels <- seq(20, 85, 5)
dv_labels <- paste(dv_levels-4, "-", dv_levels )
dv_labels[14] <- "81+"
exp |>
filter(exp_id %in% 1400) |>
mutate(dv = factor(dv, dv_levels, dv_labels)) |>
ggplot(aes(x = dv)) +
geom_bar(color = "black", fill = "white") +
scale_x_discrete(drop = FALSE) +
labs(title = "Age Ratings",
x = "")# exp |>
# filter(exp_id %in% 1400) |>
# mutate(dv = as.numeric(dv) - 2.5) |>
# summarise(age = mean(dv), age_sd = sd(dv), .by = trial_name)exp |>
filter(exp_id %in% 1400) |>
mutate(dv = factor(dv, dv_levels, dv_labels)) |>
mutate(trial_name = gsub("_std_neu_0", "", trial_name)) |>
separate(trial_name, c("lab", "id")) |>
count(lab, id, dv) |>
ggplot(aes(x = dv, y = id, fill = n)) +
geom_tile() +
facet_wrap(~lab) +
scale_fill_viridis_c() +
labs(x = "Age", y = NULL) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90))